## Documentation for Preparing LobbyView Lobbying Data -------------------------
# This R script documents how I acquired and cleaned LobbyView data for 
# subsequent merging and analyses


## R Script Outputs ------------------------------------------------------------
# lv-imm-fr-99-17.RData (contains all variables used in lv-imm-fr-99-17-rep.RData)
# lv-imm-fy-99-17.RData (contains all variables used in lv-imm-fy-99-17-rep.RData, loaded in merge-data.R)
# lv-tob-fy-17.RData (loaded in merge-data.R)
# lv-bev-fy-17.RData (loaded in merge-data.R)
# lv-cdt-fy-17.RData (loaded in merge-data.R)
# sup-imm-lob-firms.RData (loaded in merge-data.R)


## Instructions ----------------------------------------------------------------
# Step 1: Adjust MAIN_DIR to where README.txt is located
# Step 2: Run entire script


## setup -----------------------------------------------------------------------
# clean slate
rm(list = ls())
date()

# load packages
pkg <- c("tidyverse")

lapply(pkg, require, character.only = TRUE)

# set main directory
MAIN_DIR <- "~/Dropbox/Research/JOP-h1b-replication"


## load data --------------------------------------------------------------------
# I queried and downloaded data on all lobbying reports that contained IMM as a 
# general issue area code between 1999 and 2017 from LobbyView's website 
# <https://www.lobbyview.org/> in October 2019. I archive and load the raw data 
# below
load(file = paste(MAIN_DIR, "lv-imm-reports-99-17-raw.RData", sep = "/"))

# some manually checked data (2000--2007)
checked.reports.1 <- read_csv(paste(MAIN_DIR, "lv-check-v1-venue-text-2000-2007.csv", 
                                   sep = "/"))

# additional manually checked data (1999-2017)
checked.reports.2 <- read_csv(paste(MAIN_DIR, "lv-check-v2-venue-text-1999-2017.csv", 
                                      sep = "/"))

# some manually checked data on report amount (1999--2007)
checked.reports.3 <- read_csv(paste(MAIN_DIR, "lv-check-v3-amount-1999-2007.csv", 
                                     sep = "/"))

# some manually checked data on venues (1999-2017) 
checked.reports.venue <- read_csv(paste(MAIN_DIR, "lv-check-venues-1999-2017.csv", 
                                         sep = "/"))

# I acquired additional 2017 report-level data for select issue area codes 
# directly from LobbyView's principal investigator In Song Kim in January 2020. 
# I archive and load the raw data below
load(file = paste(MAIN_DIR, "lv-other-reports-17-raw.RData", sep = "/"))

# I downloaded premade data on all 2017 report-issue-level data from LobbyView's 
# website in September 2020. I archived and load the raw data below
load(file = paste(MAIN_DIR, "lv-all-report-issue-17-raw.RData", sep = "/"))


## clean immigration lobbying data ---------------------------------------------
# subset data to IMM reports, exclude registration reports, create variables of interest
lob.img.reports <- lob.img.reports %>%
  filter(report_type != "REGISTRATION" & report_type != "REGISTRATION AMENDMENT") %>%
  group_by(report_id) %>%
  mutate(n_issues = n(),
         n_imm = table(issue)[names(table(issue)) == "IMM"],
         mean_img_expense_report = rep_amount / n_issues * n_imm) %>%
  arrange(client_bvdid, year, period, date_filed) %>%
  filter(issue == "IMM") %>%
  filter(!is.na(client_bvdid)) %>%
  mutate(
    # correct Hilton bvdid and legal name
    client_bvdid = ifelse(client_bvdid == "OM30099GO", "US274384691", client_bvdid),
    client_legal_name = ifelse(client_legal_name == "Salalah Beach Resort", "Hilton Worldwide Holdings Inc", client_legal_name),
    client_gvkey = ifelse(client_gvkey == "288270", "5643", client_gvkey),
    
    # correct MCKINSEY & COMPANY, INC
    client_bvdid = ifelse(client_bvdid == "PHA1998675", "US127658529L", client_bvdid),
    client_legal_name = ifelse(client_legal_name == "MCKINSEY AND CO., (PHILS.)", "MCKINSEY & COMPANY, INC", client_legal_name),
    client_naics = ifelse(client_bvdid == "US127658529L", "5416", client_naics)) 

# subset to firms
# Drop universities, gov entities, chambers, associations, charities, religious 
# organizations, unions, and non-profits

# filter by industry 
# set NAICS to exclude
naics.drop <- c("611310", "611699", # universities
                "813910", # business associations
                "813930" # Labor Unions/Orgs
)

# subset
lob.img.reports.s.1 <- lob.img.reports %>%
  filter(!client_naics %in% naics.drop)

# filter by name
# set names to exclude
drop.vec <- c("UNIVERSITY", 
              "University", 
              " UNIV$", 
              "^UNIV-", 
              "^COLLEGE ", 
              " COLLEGE ", 
              "-COLLEGE ", 
              " COLLEGE$", 
              "^COLLEGES ", 
              " COLLEGES ", 
              " COLLEGES$", 
              "^SCHOOL ", 
              " SCHOOL ", 
              " SCHOOL$", 
              "^SCHOOLS ", 
              " SCHOOLS ", 
              " SCHOOLS$", 
              "^SCHOOLS ", 
              " SCHOOLS ", 
              " SCHOOLS$", 
              "^COUNTY ", 
              " COUNTY$", 
              " COUNTY ", 
              "^CITY OF", 
              " CITY$", 
              "^CHAMBER OF", 
              " CHAMBER OF", 
              " CHAMBER$", 
              "^ASSOCIATION OF ", 
              " ASSOCIATION OF ", 
              " ASSOCIATION", 
              " Association", 
              " ASSOC", 
              "^ASSN ", 
              " ASSN ", 
              " ASSN$", 
              "^Assn ", 
              " Assn ", 
              " Assn$", 
              "^ASSN ", 
              " ASSN ", 
              " ASSN$", 
              "Association of ", 
              "ASSOCIATION OF ", 
              " ASSOCIATION$", 
              " Association$", 
              " Association ", 
              " ASSOCIATION ", 
              "-Association ", 
              "^ASSOCIATED ", 
              "^COUNCIL ", 
              " COUNCIL$", 
              " COUNCIL ", 
              "^Council ", 
              " Council$", 
              " Council ", 
              "^COUNCILS ", 
              " COUNCILS$", 
              "^COUNCIL ", 
              " COUNCIL$", 
              " COUNCIL ", 
              "^Council ", 
              " Council$", 
              " Council ", 
              " Council,", 
              "^COUNCILS ", 
              " COUNCILS$", 
              "^SOCIETY ", 
              " SOCIETY ", 
              " SOCIETY$", 
              "^CONFERENCE ", 
              " CONFERENCE ", 
              " CONFERENCE$", 
              "^CONFERENCE ", 
              " CONFERENCE ", 
              " CONFERENCE$", 
              "^AUTHORITY ", 
              " AUTHORITY ", 
              " AUTHORITY$", 
              "^AUTHORITY ", 
              " AUTHORITY ", 
              " AUTHORITY$", 
              "^COALITION ", 
              " COALITION ", 
              " COALITION$", 
              " Coalition$", 
              "^COALITION ", 
              " COALITION ", 
              " COALITION$", 
              "^ALLIANCE ", 
              " ALLIANCE ", 
              "^ALLIED ", 
              " ALLIED ", 
              " ALLIED$", 
              "^ALLIANCE ", 
              " ALLIANCE ", 
              " ALLIANCE$", 
              "^WORKERS ", 
              " WORKERS ", 
              " WORKERS$", 
              "^LEAGUE ", 
              " LEAGUE ", 
              " LEAGUE$", 
              "^COMMONWEALTH ", 
              " COMMONWEALTH ", 
              " COMMONWEALTH$", 
              "^STATE ", 
              " STATE ", 
              " STATE$", 
              "^DEPARTMENT ", 
              " DEPARTMENT ", 
              " DEPARTMENT$",  
              "^BUREAU ", 
              " BUREAU ", 
              " BUREAU$",  
              "^AGENCY ", 
              " AGENCY ", 
              " AGENCY$",  
              " DISTRICT$",  
              " ADMINISTRATION$",  
              "^FEDERATION ", 
              " FEDERATION ", 
              " FEDERATION$", 
              " Federation$", 
              " FEDERATI$", 
              "^FEDERATIONS ", 
              " FEDERATIONS ", 
              " FED OF ", 
              " FEDERATIONS$", 
              "^CENTER OF ", 
              " CENTER OF ", 
              "^CENTER FOR ", 
              "^CENTER ON ", 
              " CENTER$", 
              "AFL-CIO", 
              "^CAMPAIGN ", 
              " CAMPAIGN ", 
              " CAMPAIGN$", 
              "^ADVOCACY ", 
              " ADVOCACY ", 
              " ADVOCACY$", 
              "^UNION OF ", 
              " UNION OF ", 
              "^FOUNDATION ", 
              " FOUNDATION$", 
              "^Foundation ", 
              " Foundation$", 
              " FOUNDATION FOR ", 
              "^COMMITTEE ", 
              " COMMITTEE ", 
              " COMMITTEE$", 
              "^COMMISSION ", 
              " COMMISSION ", 
              " COMMISSION$", 
              "^ORGANIZATION ", 
              " ORGANIZATION ", 
              " ORGANIZATION$", 
              "^ORGANIZATIONS ", 
              " ORGANIZATIONS ", 
              " ORGANIZATIONS$", 
              "^CONGRESS ", 
              " CONGRESS ", 
              " CONGRESS$",  
              " FORUM$", 
              "^GOVERNMENT OF ", 
              " WORKERS OF ", 
              " WORKERS$", 
              " WORKERS INTERNATIONAL", 
              "CONSERVATIVE UNION", 
              "LIBERTIES UNION", 
              "INTERNATIONAL UNION", 
              "TAXPAYERS UNION", 
              "BORDER CONTROL", 
              " ROUNDTABLE$", 
              " TRIBE$",  
              " TRIBE OF ", 
              " LEADERS FOR ",  
              "^AMERICANS FOR ", 
              "^TAXPAYERS FOR ", 
              "Farmworker Justice", 
              "AMERICANS FOR PROSPERITY", 
              "ANGEL ISLAND IMMIGRATION STATI", 
              "MEXICAN AMERICAN LEGAL DEFENSE", 
              "CHEYENNE CATHOLIC DIOCESE", 
              "CATHOLIC CHARITIES USA", 
              "SALT LAKE CITY CORP ENGINEERING DIVISION", 
              "MADISON WI", 
              "Louisville, Kentucky", 
              "AMERICAN ACADEMY OF ", 
              "NATIONAL ACADEMY OF ", 
              "BROTHERHOOD OF", 
              "WOMEN - SOCIETA' COOPERATIVA",
              "SIERRA CLUB",
              "DIRECTORS GUILD OF AMERICA INC",
              "WESTERN GROWERS",
              "SELECT MILK PRODUCERS INC",
              "TEACH FOR AMERICA",
              "HUMAN RIGHTS FIRST INC",
              "UNITE HERE",
              "GIRL SCOUTS OF USA",
              "GIRL SCOUTS OF THE U S A",
              "KIDSPEACE CORP",
              "OSF HEALTHCARE SYSTEM",
              "PEOPLE FOR AMERICAN WAY",
              "DAIRY FARMERS OF AMERICA INC",
              "AMERICAN RIVERS",
              "BREAD FOR THE WORLD INC",
              "GTOWN JESUIT COMMUNITY HOUSE",
              "AMERICAS VOICE",
              "REFUGEE ASSISTANCE PROJECT",
              "COMMUNICATING FOR AMERICA",
              "CHURCH OF SCIENTOLOGY INTERNATIONAL",
              "DISABILITY RIGHTS EDUCATION & DEFENSE FUND, INC",
              "WESTERN UNITED DAIRYMEN",
              "CALIFORNIA RURAL LEGAL ASSISTANCE",
              "LAO FAMILY COMMUNITY OF MINNESOTA INC",
              "LEECH LAKE BAND OF OJIBWE",
              "QUEENS BOROUGH PUBLIC LIBRARY",
              "ROYAL LAO FOUNDATION INC",
              "AMERICAN MUSEUM OF NATURAL HISTORY",
              "AMERICAS POLITICAL ACTION COM",
              "GREATER CLEVELAND PARTNERSHIP",
              "CHILDREN'S DEFENSE FUND",
              "FEEDING AMERICA",
              "AMERICAN FAMILIES UNITED",
              "AARP",
              "CONSTITUTION PROJECT",
              "DANCE/USA",
              "FAMILIES USA FOUNDATION INC",
              "NATL DISABILITY RIGHTS NETWORK",
              "Refugees International",
              "Log Cabin Republicans",
              "UNITED BROTHERHOOD CARPENTERS & JOINERS",
              "NATIONAL CENTER FOR FAMILY LEARNING",
              "NUMBERSUSA COM",
              "Numbers USA",
              "AMNESTY INTERNATIONAL OF THE USA INC",
              "AMERICAN LEGION",
              "American Institute of Certified Public Accountants",
              "UNIVERSAL HUMAN RIGHTS NETWORK",
              "GAY LESBIAN & STRAIGHT EDUCATION NETWORK",
              "CONSUMERS FOR CABLE CHOICE",
              "UNITED DAIRYMAN OF ARIZONA",
              "Change to Win",
              "IMMIGRATION EQUALITY",
              "American Frozen Food Institute",
              "FLORIDA CITRUS MUTUAL",
              "EARTHJUSTICE LEGAL DEFENSE FUND",
              "Kids in Need of Defense",
              "AMERICAN MUSHROOM INSTITUTE",
              "USAction",
              "NATIONAL FISHERIES INSTITUTE",
              "AIR CONDITIONING CONTRACTORS OF AMERICA",
              "USA-ITA",
              "LAO VETERANS OF AMERICA INSTITUTE",
              "HUMANITY UNITED",
              "Immigration Centers of America- Farmville LLC",
              "Islamic Relief USA",
              "RECORDING ACADEMY",
              "Engine Advocacy",
              "NATL BOARD FOR CERTIFICATION IN OCCUPATIONAL",
              "Tile Roofing Institute",
              "ACT FOR AMERICA",
              "Leading Builders of America",
              "UNITED TO SECURE AMERICA",
              "IMMIGRATION VOICE",
              "PLANNED PARENTHOOD ACTION FUND INC",
              "U.S. Immigration Fund LLC",
              "COMPETE AMERICA",
              "PARK CITY UTAH",
              "NALEO Educational Fund",
              "Children's Defense Fund",
              "U S FUND FOR UNICEF",
              "ROCKIT FUND",
              "MEXICAN & AMERICANS THINKING TOGETHER",
              "NORTH AMERICAN INSTITUTE OF AVIATION (NAIA)",
              "California Healthcare Institute",
              "AIDS INSTITUTE",
              "AMERICAN FROZEN FOOD INSTITUTE",
              "INTERLOCKING CONCRETE PAVEMENT INSTITUTE",
              "LEGAL MOMENTUM",
              "National Employment Law Project",
              "PROFESSIONAL LANDCARE NETWORK",
              "NEGATIVE POPULATION GROWTH",
              "BOAT PEOPLE SOS",
              "WASHINGTON SOCCER PARTNERS",
              "LEADINGAGE",
              "ACCESS",
              "PRASHANT MODI",
              "LEADING BUILDERS OF AMERICA",
              "Chiefs for Change",
              "Nueva Esperanza", # non-profit
              "NUEVA ESPERANZA" # non-profit
              )

# subset
lob.img.reports.s.2 <- lob.img.reports.s.1 %>%
  filter(!str_detect(client_legal_name, paste(drop.vec, collapse = "|"))) %>%
  filter(!str_detect(client_name, paste(drop.vec, collapse = "|")))

# filter by BvD ID
# set BvD IDs to exclude
drop.bvdid.vec <- c("IN32052FI", # NETWORK is a national Catholic social justice lobby and not a firm
                    "TW96972164", # SEMI Is the global industry association representing the electronics manufacturing supply chain
                    "US123979500L", # International Longshore and Warehouse Union
                    "US127563627L", # Legal Momentum is the oldest legal advocacy group for women in the United States
                    "US127573375L", # The National Employment Law Project (NELP) is a 501(c)(3) nonprofit organization
                    "US132426979L", # PROFESSIONAL LANDCARE NETWORK
                    "US132258086L", # NEGATIVE POPULATION GROWTH
                    "US132113399L", # BOAT PEOPLE SOS
                    "US255482657L", # LEADINGAGE
                    "GB03055360", # TECHNOLOGY NETWORK AKA TECHNET is the national, bipartisan network of technology CEOs and senior executives that promotes the growth of the innovation economy.
                    "JP000030793JPN", # The Arab Community Center for Economic and Social Services (ACCESS) is a human services organization committed to the development of the Arab American community.
                    "US204905559L" # LEADING BUILDERS OF AMERICA is a trade association
                    )

# subset
lob.img.reports.s.2 <- lob.img.reports.s.2 %>%
  filter(!client_bvdid %in% drop.bvdid.vec) 

# identify amendments/terminations and exclude
lob.img.reports.s.3 <- lob.img.reports.s.2 %>%
  mutate(report_name_amend = if_else(str_detect(report_type, "AMENDMENT"), 1, 0),
         report_name_termination = if_else(str_detect(report_type, "TERMINATION$"), 1, 0),
         report_name_termination_amend = if_else(str_detect(report_type, "TERMINATION AMENDMENT"), 1, 0),
         rep_url = paste("https://soprweb.senate.gov/index.cfm?event=getFilingDetails&filingID=", 
                         report_id, "&filingTypeID=3", sep = "")) %>%
  group_by(client_bvdid, year, period) %>%
  mutate(n_rep_period = n(),
         amendment_exist = if_else(any(report_is_latest_amend == TRUE), 1, 0),
         termination_amend_exist = if_else(any(report_name_termination_amend == 1), 1, 0)) %>%
  mutate(drop_1 = if_else(n_rep_period > 1 & 
                            report_has_amendments == TRUE & 
                            report_is_latest_amend == FALSE & 
                            amendment_exist == 1, 1, 0),
         drop_2 = if_else(n_rep_period > 1 & 
                            report_has_amendments == FALSE &
                            termination_amend_exist == 1 &
                            report_name_termination == 1 &
                            report_name_termination_amend == 0, 1, 0)) %>%
  filter(drop_1 != 1 & drop_2 != 1) %>%
  ungroup()

# exclude reports that duplicate on the following vars
df.dups <- lob.img.reports.s.3 %>%
  select(client_bvdid, year, period, rep_amount, text, venue)

# subset
lob.img.reports.s.3 <- lob.img.reports.s.3[!duplicated(df.dups, fromLast = TRUE),]


# clean venues and specific issues
# rename variables in previously manually checked data and fill in values for NAs
checked.reports.1 <- checked.reports.1 %>%
  rename(text_manual = report_specific_issues, 
         venue_manual = report_venue) %>%
  mutate(venue_manual = ifelse(is.na(venue_manual), "NONE", venue_manual),
         text_manual = ifelse(is.na(text_manual), "", text_manual))

# subset vars for merging
checked.reports.1.sub <- checked.reports.1 %>%
  select(report_id, text_manual, venue_manual)

# merge in manually checked data
lob.img.reports.s.4 <- left_join(lob.img.reports.s.3, 
                                 checked.reports.1.sub, 
                                 by = "report_id")

# replace downloaded LobbyView info with manually checked info
lob.img.reports.s.4 <- lob.img.reports.s.4 %>%
  mutate(text = ifelse(!is.na(text_manual), text_manual, text),
         venue = ifelse(!is.na(venue_manual), venue_manual, venue)) %>%
  select(report_id, rep_url, 
         client_bvdid, client_gvkey, 
         client_name, client_legal_name, 
         client_naics,
         year, report_type, period, date_filed,
         issue,
         text,
         venue,
         rep_amount, n_issues, n_imm, mean_img_expense_report)

# create indicator for checked reports
checked.v1 <- unique(checked.reports.1$report_id)

lob.img.reports.s.4 <- lob.img.reports.s.4 %>%
  mutate(checked_v1 = ifelse(report_id %in% checked.v1, 1, 0))

# identify additional observations to manually check (no venue/text information)
check.df.v2 <- lob.img.reports.s.4 %>%
  filter(checked_v1 == 0) %>%
  mutate(no_venue = ifelse(venue == "" | is.na(venue) | venue == "NONE", 1, 0),
         no_text = ifelse(text == "" | is.na(text), 1, 0)) %>%
  rowwise() %>%
  mutate(n_venues = length(str_split(venue, "; ")[[1]])) %>%
  filter(no_venue == 1 | no_text == 1 | n_venues > 5) %>%
  select(report_id, rep_url, client_bvdid, client_name, client_legal_name,
         year, period, date_filed, issue, text, venue)

# # save to manually check
# write_csv(check.df.v2, 
#           path = paste(OUT_DIR, "lv-check-v2-venue-text-1999-2017.csv", sep = "/"))

# subset observations that are not IMM, have no IMM lobbying activity, are not firms, etc. after manual check
drop.vec.2 <- checked.reports.2 %>%
  filter(drop == 1) %>%
  pull(report_id)

# drop from main data
lob.img.reports.s.4 <- lob.img.reports.s.4 %>%
  filter(!report_id %in% drop.vec.2)

# subset vars for merging
checked.reports.2.sub <- checked.reports.2 %>%
  filter(drop != 1) %>%
  select(report_id, text, venue) %>%
  rename(text_manual = text, 
         venue_manual = venue) %>%
  mutate(venue_manual = ifelse(is.na(venue_manual), "NONE", venue_manual),
         text_manual = ifelse(is.na(text_manual), "", text_manual))

# merge in manually checked data
lob.img.reports.s.4 <- left_join(lob.img.reports.s.4, 
                                 checked.reports.2.sub, 
                                 by = "report_id")

# replace downloaded Lobbyview info with manually checked info
lob.img.reports.s.4 <- lob.img.reports.s.4 %>%
  mutate(text = ifelse(!is.na(text_manual), text_manual, text),
         venue = ifelse(!is.na(venue_manual), venue_manual, venue)) %>%
  select(report_id, rep_url, 
         client_bvdid, client_gvkey, 
         client_name, client_legal_name, 
         client_naics,
         year, report_type, period, date_filed,
         issue,
         text,
         venue,
         rep_amount, n_issues, n_imm, mean_img_expense_report)

# clean some text
lob.img.reports.s.4 <- lob.img.reports.s.4 %>%
  mutate(text = str_replace_all(text, "H.R. ", "HR"),
         text = str_replace_all(text, "H.R.", "HR"),
         text = str_replace_all(text, "HR ", "HR"),
         text = str_replace_all(text, "S\\.\\s+", "S"),
         text = str_replace_all(text, "S\\.", "S"),
         text = str_replace_all(text, "S\\s+", "S"))


# code lobbying target venues
# check venues
venue.vec <- unlist(strsplit(lob.img.reports.s.4$venue, ";"))
venue.vec <- str_trim(venue.vec, side = "both")
unique.venue.vec <- sort(unique(venue.vec))
unique.venue.vec
sort(table(venue.vec), decreasing = TRUE)

# set expressions
dos.vec <- c("DOS", 
             "Department of State", "DEPARTMENT OF STATE", 
             "State Department", "STATE DEPARTMENT", 
             "State - Dept of (DOS)", "State- Dept of (DOS)",
             "State Department (DOS)",
             "Department of State (DOS)",
             "^State$", " State;")
dhs.vec <- c("DHS", "Homeland Security", "HOMELAND SECURITY", 
             "Homeland Security - Dept")
uscis.vec <- c("Citizenship and Immigration Services", "Citizenship & Immigration Services", 
               "USCIS",
               "Immigration and Naturalization Service", "Immigration and Naturalization Services", 
               "Immigration & Naturalization Service", "Immigration & Naturalization Services",
               "INS")
ice.vec <- c("Immigration & Customs Enforcement", "Immigration and Customs Enforcement", 
             "ICE")
cbp.vec <- c("Customs & Border Protection", "Customs and Border Protection", 
             "CBP", "U.S. Customs Service", "U.S. Customs Service and Border Protection")
doc.vec <- c("DOC", 
             "Department of Commerce", "DEPARTMENT OF COMMERCE",
             "Commerce Department", "COMMERCE DEPARTMENT", "Commerce - Dept of (DOC)",
             "Departement of Commerce")
dol.vec <- c("DOL", 
             "Department of Labor", "DEPARTMENT OF LABOR",
             "Labor Department", "LABOR DEPARTMENT", "Labor - Dept of",
             "Labor")
dod.vec <- c("DOD", 
             "Defense", "DEFENSE")
doj.vec <- c("DOJ", 
             "Justice", "JUSTICE")
treas.vec <- c("Treasury", "TREASURY")
house.vec <- c("House of Representatives", "HOUSE OF REPRESENTATIVES", 
               "U.S. House", "U.S. HOUSE")
senate.vec <- c("Senate", "SENATE")
wh.vec <- c("White House", "WHITE HOUSE", "WH", "White House Office")
eop.vec <- c("Executive Office of the President", "EXECUTIVE OFFICE OF THE PRESIDENT", 
             "EOP", "Excutive Office of the President", "Executive Branch")
ustr.vec <- c("Trade Representative", "USTR")
omb.vec <- c("OMB", "Management & Budget", "Management and Budget")
ostp.vec <- c("OSTP", "Science and Technology Policy", "Science & Technology Policy")
nsc.vec <- c("NSC", "National Security Council")
nec.vec <- c("NEC", "National Economic Council")
hhs.vec <- c("HHS", "Health & Human Services", "Health and Human Services")
vp.vec <- c("Vice President")
usda.vec <- c("Agriculture")
cms.vec <- c("Medicare and Medicaid")
dot.vec <- c("Transportation")
edu.vec <- c("Education")
tda.vec <- c("Trade & Development", "Trade and Development")
eta.vec <- c("Employment & Training", "Employment and Training")
marad.vec <- c("Maritime Administration")
tsa.vec <- c("Transportation Security Administration")

# check unusual venue names
# identify unusual venue names
unusal.venue.vec <- unique.venue.vec[!str_detect(unique.venue.vec, paste(dos.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(dhs.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(uscis.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(ice.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(cbp.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(doc.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(dol.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(doj.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(house.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(senate.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(wh.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(eop.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[!str_detect(unusal.venue.vec, paste(vp.vec, collapse = "|"))]
unusal.venue.vec <- unusal.venue.vec[-which(unusal.venue.vec == "NA" | unusal.venue.vec == "None" | unusal.venue.vec == "NONE")]
unusal.venue.vec <- c(unusal.venue.vec, 
                       "Agriculture",
                       "Health & Human Services",
                       "AMTRK",
                       "Army",
                       "Defense",
                       "Energy",
                       "Export",
                       "Interior",
                       "Transportation",
                       "Treasury",
                       "ACF", "USDA", "ATF", "BIA", "CMS", "CBO", "CEQ",
                       "DOD", "USDA", "DOE", "DOI", "DOT", "DEA", "EPA",
                       "EEOC", "FAA", "FBI", "FCC", "FEMA", "FTC", "FTA",
                       "FDA", "GSA", "GAO", "GPO", "HHS", "HRSA", "IRS",
                       "ITA", "MARAD", "NASA", "NEC", "NHTSA", "NOAA", 
                       "AMTRAK", "NSF", "NSA", "NSC", "NTIA", "OMB", "OPM",
                       "OSTP", "USPTO", "PBGC", "SEC", "SBA", "SSA", "TDA",
                       "TSA", "USAID", "USCG", "HUD", "FWS", "ITC", "USPS",
                       "USTR", "VA", "VETS")
unusal.venue.vec <- str_replace_all(unusal.venue.vec, "U.S. ", "")
unusal.venue.vec

# check unusual venues
check.df.venues <- lob.img.reports.s.4 %>%
  filter(!report_id %in% checked.reports.1$report_id) %>%
  filter(!report_id %in% checked.reports.2$report_id) %>%
  mutate(strange_venue = if_else(str_detect(venue, paste(unusal.venue.vec, collapse = "|")), 1, 0)) %>%
  filter(strange_venue == 1) %>%
  select(report_id, rep_url, client_bvdid, client_name, client_legal_name,
         year, period, date_filed, issue, text, venue)

# # save to manually check
# write_csv(check.df.venues, 
#           path = paste(OUT_DIR, "lv-check-venues-1999-2017.csv", sep = "/"))

# subset manually checked data for merging
checked.reports.venue.sub <- checked.reports.venue %>%
  select(report_id, venue) %>%
  rename(venue_manual = venue)

# merge in manually checked data
lob.img.reports.s.4 <- left_join(lob.img.reports.s.4, 
                                 checked.reports.venue.sub, 
                                 by = "report_id")

# replace downloaded LobbyView info with manually checked info
lob.img.reports.s.4 <- lob.img.reports.s.4 %>%
  mutate(venue = ifelse(!is.na(venue_manual), venue_manual, venue)) %>%
  select(report_id, rep_url, 
         client_bvdid, client_gvkey, 
         client_name, client_legal_name, 
         client_naics,
         year, report_type, period, date_filed,
         issue,
         text,
         venue,
         rep_amount, n_issues, n_imm, mean_img_expense_report) %>%
  mutate(venue = str_replace_all(venue, "None", "NONE"),
         venue = str_replace_all(venue, "UNDETERMINED", ""))

# check venues
venue.vec <- unlist(strsplit(lob.img.reports.s.4$venue, ";"))
venue.vec <- str_trim(venue.vec, side = "both")
unique.venue.vec <- sort(unique(venue.vec))
unique.venue.vec
sort(table(venue.vec), decreasing = TRUE)

# create variables of interest
lob.img.reports.s.5 <- lob.img.reports.s.4 %>%
  mutate(lob_dos = if_else(str_detect(venue, paste(dos.vec, collapse = "|")), 1, 0),
         lob_dhs = if_else(str_detect(venue, paste(dhs.vec, collapse = "|")), 1, 0),
         lob_uscis = if_else(str_detect(venue, paste(uscis.vec, collapse = "|")), 1, 0),
         lob_ice = if_else(str_detect(venue, paste(ice.vec, collapse = "|")), 1, 0),
         lob_cbp = if_else(str_detect(venue, paste(cbp.vec, collapse = "|")), 1, 0),
         lob_doc = if_else(str_detect(venue, paste(doc.vec, collapse = "|")), 1, 0),
         lob_dol = if_else(str_detect(venue, paste(dol.vec, collapse = "|")), 1, 0),
         lob_dod = if_else(str_detect(venue, paste(dod.vec, collapse = "|")), 1, 0),
         lob_doj = if_else(str_detect(venue, paste(doj.vec, collapse = "|")), 1, 0),
         lob_treas = if_else(str_detect(venue, paste(treas.vec, collapse = "|")), 1, 0),
         lob_house = if_else(str_detect(venue, paste(house.vec, collapse = "|")), 1, 0),
         lob_senate = if_else(str_detect(venue, paste(senate.vec, collapse = "|")), 1, 0),
         lob_wh = if_else(str_detect(venue, paste(wh.vec, collapse = "|")), 1, 0),
         lob_eop = if_else(str_detect(venue, paste(eop.vec, collapse = "|")), 1, 0),
         lob_ustr = if_else(str_detect(venue, paste(ustr.vec, collapse = "|")), 1, 0),
         lob_omb = if_else(str_detect(venue, paste(omb.vec, collapse = "|")), 1, 0),
         lob_ostp = if_else(str_detect(venue, paste(ostp.vec, collapse = "|")), 1, 0),
         lob_nsc = if_else(str_detect(venue, paste(nsc.vec, collapse = "|")), 1, 0),
         lob_nec = if_else(str_detect(venue, paste(nec.vec, collapse = "|")), 1, 0),
         lob_hhs = if_else(str_detect(venue, paste(hhs.vec, collapse = "|")), 1, 0),
         lob_vp = if_else(str_detect(venue, paste(vp.vec, collapse = "|")), 1, 0),
         lob_usda = if_else(str_detect(venue, paste(usda.vec, collapse = "|")), 1, 0),
         lob_cms = if_else(str_detect(venue, paste(cms.vec, collapse = "|")), 1, 0),
         lob_dot = if_else(str_detect(venue, paste(dot.vec, collapse = "|")), 1, 0),
         lob_edu = if_else(str_detect(venue, paste(edu.vec, collapse = "|")), 1, 0),
         lob_tda = if_else(str_detect(venue, paste(tda.vec, collapse = "|")), 1, 0),
         lob_eta = if_else(str_detect(venue, paste(eta.vec, collapse = "|")), 1, 0),
         lob_marad = if_else(str_detect(venue, paste(marad.vec, collapse = "|")), 1, 0),
         lob_tsa = if_else(str_detect(venue, paste(tsa.vec, collapse = "|")), 1, 0),
         lob_congress = ifelse(lob_house == 1 | lob_senate == 1, 1, 0)) 


# check and fill in missing report amount
# after 2008, amounts are not reported for those below $5000
# fill in upper bound
lob.img.reports.s.5 <- lob.img.reports.s.5 %>%
  mutate(rep_amount = ifelse(is.na(rep_amount) & year >= 2008, 5000, rep_amount))

# identify observations to manually check
check.df.v3 <- lob.img.reports.s.5 %>% 
  filter(is.na(rep_amount)) %>%
  select(report_id, rep_url, client_bvdid, client_name, client_legal_name,
         year, period, date_filed, issue, rep_amount)

# # save to manually check
# write_csv(check.df.v3, 
#           path = paste(OUT_DIR, "lv-check-v3-amount-1999-2008.csv", sep = "/"))

# subset manually checked data for merging
checked.reports.3.sub <- checked.reports.3 %>%
  select(report_id, rep_amount) %>%
  rename(rep_amount_manual = rep_amount)

# before 2008, amounts are not reported for those below $10000
# fill in upper bound
lob.img.reports.s.5 <- lob.img.reports.s.5 %>%
  mutate(rep_amount = ifelse(rep_amount == 0 & year <= 2007, 10000, rep_amount))

# merge in manually checked data
lob.img.reports.s.6 <- left_join(lob.img.reports.s.5, 
                                 checked.reports.3.sub, 
                                 by = "report_id")

# replace downloaded LobbyView info with manually checked info and recalculate average IMM expenses
lob.img.reports.s.6 <- lob.img.reports.s.6 %>%
  mutate(rep_amount = ifelse(!is.na(rep_amount_manual), rep_amount_manual, rep_amount),
         mean_img_expense_report = rep_amount / n_issues * n_imm) %>%
  select(-rep_amount_manual) 


# identify and update LobbyView BvD IDs
lob.img.reports.s.6 <- lob.img.reports.s.6 %>%
  mutate(
    # KPMG LLP
    client_bvdid = ifelse(client_bvdid == "CA258739946L", "US149092060L", client_bvdid),
    
    # Nu Skin
    client_bvdid = ifelse(client_bvdid == "CA261121361L", "US870565309", client_bvdid),
    
    # UNITED STEEL PAPER & FORESTRY RUBBER MANUFACTURING ENERGY
    client_bvdid = ifelse(client_bvdid == "CAD751645602D", "US145573400L", client_bvdid),
    
    # Cultural Care Au Pair
    client_bvdid = ifelse(client_bvdid == "DE5050687408", "US125976444L", client_bvdid),
    
    # SAS INSTITUTE
    client_bvdid = ifelse(client_bvdid == "FR327957106", "US149204549L", client_bvdid),
    
    # DELOITTE LLP
    client_bvdid = ifelse(client_bvdid == "GBOC303675", "US144597670L", client_bvdid),
    
    # LENNAR VENTURES LLC
    client_bvdid = ifelse(client_bvdid == "US*918398614", "US954337490", client_bvdid),
    
    # OMEGA PROTEIN INC
    client_bvdid = ifelse(client_bvdid == "US131671412L", "US760562134", client_bvdid),
    
    # TIMKEN COMPANY
    client_bvdid = ifelse(client_bvdid == "US128824052L", "US340577130", client_bvdid),
    
    # Aramark Sports & Entertainment Services, LLC
    client_bvdid = ifelse(client_bvdid == "US208236097", "US2-128577", client_bvdid),
    
    # DROPBOX INC
    client_bvdid = ifelse(client_bvdid == "US227724122L", "US260138832", client_bvdid),
    
    # SAP AMERICA INC
    client_bvdid = ifelse(client_bvdid == "US240754385L", "US128458298L", client_bvdid),
    
    # SPOTIFY USA INC
    client_bvdid = ifelse(client_bvdid == "US255387059L", "US241039772L", client_bvdid),
    
    # DISNEY WORLDWIDE SERVICES INC
    client_bvdid = ifelse(client_bvdid == "US954545390", "US164156901L", client_bvdid),
    
    # SABRE GLBL INC
    client_bvdid = ifelse(client_bvdid == "USLEI1152", "US273732821L", client_bvdid),
    client_bvdid = ifelse(client_bvdid == "US135202997L", "US273732821L", client_bvdid),
    
    # JBS USA LLC
    client_bvdid = ifelse(client_bvdid == "US239971745L", "US193429689L", client_bvdid))


# code lobbying text variables
# set keywords to detect
text.visa.vec <- c("Visa", "visa", "VISA")
text.h1b.vec <- c("H-1B", "h-1b", "H1-B", "h1-b", "H1B", "h1b")
text.hs.vec <- c("high-skill", "high skill", "High skill", "High Skill")
text.act.vec <- c("Act", "act", "ACT", 
                  "Bill", "bill", "BILL", 
                  "^HR[0-9]+", " HR[0-9]+", "HR[0-9]+$",
                  "^S[0-9]+", " S[0-9]+", "S[0-9]+$",
                  "^PL[0-9]+", " PL[0-9]+", "PL[0-9]+$",
                  "legislation", "Legislation", "LEGISLATION",
                  "provision", "Provision", "PROVISION",
                  "law", "Law", "LAW",
                  "reform", "Reform", "REFORM")

# create variables
lob.img.reports.s.7 <- lob.img.reports.s.6 %>%
  mutate(text_visa = if_else(str_detect(text, paste(text.visa.vec, collapse = "|")), 1, 0),
         text_h1b = if_else(str_detect(text, paste(text.h1b.vec, collapse = "|")), 1, 0),
         text_hs = if_else(str_detect(text, paste(text.hs.vec, collapse = "|")), 1, 0),
         text_act = if_else(str_detect(text, paste(text.act.vec, collapse = "|")), 1, 0),
         text_hs_h1b_visa = if_else(text_visa == 1 & (text_h1b == 1 | text_hs == 1), 1, 0))


# clean and save IMM report-level data
# select vars
lob.img.reports.final <- lob.img.reports.s.7 %>%
  select(
    # variables used in lv-imm-fr-99-17-rep.RData
    report_id, 
    year, 
    report_type, 
    period, 
    client_bvdid, 
    client_name,
    issue, 
    rep_amount,
    n_issues,
    n_imm, 
    mean_img_expense_report,
    venue,
    lob_dhs,
    lob_wh,
    lob_eop,
    lob_dol,
    lob_house,
    lob_senate,
    
    # additional variables used in merge-data.R
    client_naics,
    lob_uscis,
    lob_congress,
    lob_ice,
    lob_cbp,
    lob_dos,
    lob_doc,
    lob_dol,
    lob_dod,
    lob_doj,
    lob_treas,
    lob_ustr,
    lob_omb,
    lob_ostp,
    lob_nsc,
    lob_nec,
    lob_hhs,
    lob_vp,
    lob_usda,
    lob_cms,
    lob_dot,
    lob_edu,
    lob_tda,
    lob_eta,
    lob_marad,
    lob_tsa,

    text_hs_h1b_visa)

# save
save(lob.img.reports.final, 
     file = paste(MAIN_DIR, "lv-imm-fr-99-17.RData", sep = "/")) 


# aggregate up to firm-year data 
# create firm-year dataframe
# mode function for strings
calculate_mode <- function(x) {
  uniqx <- unique(na.omit(x))
  uniqx[which.max(tabulate(match(x, uniqx)))]
}

# aggregate up annual mean immigration expense and number of reports
lob.img.firms.years <- lob.img.reports.final %>%
  group_by(client_bvdid, year) %>%
  arrange(period) %>%
  summarize(
    client_name = last(client_name),
    client_naics = calculate_mode(client_naics),
    est_img_expense_fy = sum(mean_img_expense_report, na.rm = TRUE),
    n_img_rep_year = length(unique(report_id)),
    
    lob_dos = ifelse(any(lob_dos == 1, na.rm = TRUE), 1, 0),
    lob_dhs = ifelse(any(lob_dhs == 1, na.rm = TRUE), 1, 0),
    lob_uscis = ifelse(any(lob_uscis == 1, na.rm = TRUE), 1, 0),
    lob_ice = ifelse(any(lob_ice == 1, na.rm = TRUE), 1, 0),
    lob_cbp = ifelse(any(lob_cbp == 1, na.rm = TRUE), 1, 0),
    lob_doc = ifelse(any(lob_doc == 1, na.rm = TRUE), 1, 0),
    lob_dol = ifelse(any(lob_dol == 1, na.rm = TRUE), 1, 0),
    lob_dod = ifelse(any(lob_dod == 1, na.rm = TRUE), 1, 0),
    lob_doj = ifelse(any(lob_doj == 1, na.rm = TRUE), 1, 0),
    lob_treas = ifelse(any(lob_treas == 1, na.rm = TRUE), 1, 0),
    lob_house = ifelse(any(lob_house == 1, na.rm = TRUE), 1, 0),
    lob_senate = ifelse(any(lob_senate == 1, na.rm = TRUE), 1, 0),
    lob_wh = ifelse(any(lob_wh == 1, na.rm = TRUE), 1, 0),
    lob_eop = ifelse(any(lob_eop == 1, na.rm = TRUE), 1, 0),
    lob_ustr = ifelse(any(lob_ustr == 1, na.rm = TRUE), 1, 0),
    lob_omb = ifelse(any(lob_omb == 1, na.rm = TRUE), 1, 0),
    lob_ostp = ifelse(any(lob_ostp == 1, na.rm = TRUE), 1, 0),
    lob_nsc = ifelse(any(lob_nsc == 1, na.rm = TRUE), 1, 0),
    lob_nec = ifelse(any(lob_nec == 1, na.rm = TRUE), 1, 0),
    lob_hhs = ifelse(any(lob_hhs == 1, na.rm = TRUE), 1, 0),
    lob_vp = ifelse(any(lob_vp == 1, na.rm = TRUE), 1, 0),
    lob_usda = ifelse(any(lob_usda == 1, na.rm = TRUE), 1, 0),
    lob_cms = ifelse(any(lob_cms == 1, na.rm = TRUE), 1, 0),
    lob_dot = ifelse(any(lob_dot == 1, na.rm = TRUE), 1, 0),
    lob_edu = ifelse(any(lob_edu == 1, na.rm = TRUE), 1, 0),
    lob_tda = ifelse(any(lob_tda == 1, na.rm = TRUE), 1, 0),
    lob_eta = ifelse(any(lob_eta == 1, na.rm = TRUE), 1, 0),
    lob_marad = ifelse(any(lob_marad == 1, na.rm = TRUE), 1, 0),
    lob_tsa = ifelse(any(lob_tsa == 1, na.rm = TRUE), 1, 0),
    lob_congress = ifelse(any(lob_congress == 1, na.rm = TRUE), 1, 0),
    
    text_hs_h1b_visa = ifelse(any(text_hs_h1b_visa == 1, na.rm = TRUE), 1, 0)) %>%
  ungroup() 

# select variables
lob.img.firms.years <- lob.img.firms.years %>%
  select(
    # variables used in lv-imm-fy-99-17-rep.RData
    client_bvdid, 
    client_name, 
    year, 
    n_img_rep_year,
    est_img_expense_fy,
    
    # additional variables used in merge-data.R
    client_naics,
    lob_uscis,
    lob_congress,
    lob_ice,
    lob_cbp,
    lob_dos,
    lob_doc,
    lob_dol,
    lob_dod,
    lob_doj,
    lob_treas,
    lob_ustr,
    lob_omb,
    lob_ostp,
    lob_nsc,
    lob_nec,
    lob_hhs,
    lob_vp,
    lob_usda,
    lob_cms,
    lob_dot,
    lob_edu,
    lob_tda,
    lob_eta,
    lob_marad,
    lob_tsa,

    text_hs_h1b_visa)

# save
save(lob.img.firms.years, 
     file = paste(MAIN_DIR, "lv-imm-fy-99-17.RData", sep = "/"))  


## clean lobbying data for other select issue areas ----------------------------
# identify and update LobbyView BvD IDs
lv.other.reports.17 <- lv.other.reports.17.raw %>%
  mutate(
    # correct Hilton bvdid and legal name
    bvdid = ifelse(bvdid == "OM30099GO", "US274384691", bvdid),
    
    # correct MCKINSEY & COMPANY, INC
    bvdid = ifelse(bvdid == "PHA1998675", "US127658529L", bvdid),
    
    # Nueva Esperanza
    bvdid = ifelse(bvdid == "AR30-61619533-2", "US128485602L", bvdid),
    
    # KPMG LLP
    bvdid = ifelse(bvdid == "CA258739946L", "US149092060L", bvdid),
    
    # Nu Skin
    bvdid = ifelse(bvdid == "CA261121361L", "US870565309", bvdid),
    
    # UNITED STEEL PAPER & FORESTRY RUBBER MANUFACTURING ENERGY
    bvdid = ifelse(bvdid == "CAD751645602D", "US145573400L", bvdid),
    
    # Cultural Care Au Pair
    bvdid = ifelse(bvdid == "DE5050687408", "US125976444L", bvdid),
    
    # SAS INSTITUTE
    bvdid = ifelse(bvdid == "FR327957106", "US149204549L", bvdid),
    
    # DELOITTE LLP
    bvdid = ifelse(bvdid == "GBOC303675", "US144597670L", bvdid),
    
    # LENNAR VENTURES LLC
    bvdid = ifelse(bvdid == "US*918398614", "US954337490", bvdid),
    
    # OMEGA PROTEIN INC
    bvdid = ifelse(bvdid == "US131671412L", "US760562134", bvdid),
    
    # TIMKEN COMPANY
    bvdid = ifelse(bvdid == "US128824052L", "US340577130", bvdid),
    
    # Aramark Sports & Entertainment Services, LLC
    bvdid = ifelse(bvdid == "US208236097", "US2-128577", bvdid),
    
    # DROPBOX INC
    bvdid = ifelse(bvdid == "US227724122L", "US260138832", bvdid),
    
    # SAP AMERICA INC
    bvdid = ifelse(bvdid == "US240754385L", "US128458298L", bvdid),
    
    # SPOTIFY USA INC
    bvdid = ifelse(bvdid == "US255387059L", "US241039772L", bvdid),
    
    # DISNEY WORLDWIDE SERVICES INC
    bvdid = ifelse(bvdid == "US954545390", "US164156901L", bvdid),
    
    # SABRE GLBL INC
    bvdid = ifelse(bvdid == "USLEI1152", "US273732821L", bvdid),
    bvdid = ifelse(bvdid == "US135202997L", "US273732821L", bvdid),
    
    # JBS USA LLC
    bvdid = ifelse(bvdid == "US239971745L", "US193429689L", bvdid)
  )

# extract firms and save: TOB 
df.2017.tob <- lv.other.reports.17 %>%
  select(report_id, client_name, bvdid, client_gvkey, report_amount, 
         reporting_year,
         TOB) %>%
  filter(TOB != "NULL") %>%
  filter(bvdid != "NULL")

lob.tob.firms.years <- tibble(client_bvd = sort(unique(df.2017.tob$bvdid)),
                              lob_tob_2017 = 1)

# save
save(lob.tob.firms.years, 
     file = paste(MAIN_DIR, "lv-tob-fy-17.RData", sep = "/"))  

# extract firms and save: BEV 
df.2017.bev <- lv.other.reports.17 %>%
  select(report_id, client_name, bvdid, client_gvkey, report_amount, reporting_year,
         BEV) %>%
  filter(BEV != "NULL") %>%
  filter(bvdid != "NULL")

lob.bev.firms.years <- tibble(client_bvd = sort(unique(df.2017.bev$bvdid)),
                              lob_bev_2017 = 1)

# save
save(lob.bev.firms.years, 
     file = paste(MAIN_DIR, "lv-bev-fy-17.RData", sep = "/"))  


# extract firms and save: CDT
df.2017.cdt <- lv.other.reports.17 %>%
  select(report_id, client_name, bvdid, client_gvkey, report_amount, reporting_year,
         CDT) %>%
  filter(CDT != "NULL") %>%
  filter(bvdid != "NULL")

lob.cdt.firms.years <- tibble(client_bvd = sort(unique(df.2017.cdt$bvdid)),
                              lob_cdt_2017 = 1)

# save
save(lob.cdt.firms.years, 
     file = paste(MAIN_DIR, "lv-cdt-fy-17.RData", sep = "/"))  


## supplementary data on immigration lobbying firms ----------------------------
# set keywords
text.visa.vec <- c("Visa", "visa", "VISA")
text.h1b.vec <- c("H-1B", "h-1b", "H1-B", "h1-b", "H1B", "h1b")
text.hs.vec <- c("high-skill", "high skill", "High skill", "High Skill")

# create indicator variables for text and venue
df.report.2017.client.issue.target.c <- lv.all.report.issue.2017.raw %>%
  mutate(client_name = toupper(client_name),
         
         is_IMM = ifelse(issue_code == "IMM", 1, 0),
         has_text_immigration = str_detect(issue_text, "immigration|Immigration|IMMIGRATION"),
         
         has_text_visa = if_else(str_detect(issue_text, paste(text.visa.vec, collapse = "|")), 1, 0),
         has_text_h1b = if_else(str_detect(issue_text, paste(text.h1b.vec, collapse = "|")), 1, 0),
         has_text_hs = if_else(str_detect(issue_text, paste(text.hs.vec, collapse = "|")), 1, 0),
         
         has_img_text = ifelse(has_text_immigration == TRUE | has_text_visa == TRUE | has_text_h1b == TRUE, 1, 0),
         has_text_hs_h1b_visa = if_else(has_text_visa == 1 & (has_text_h1b == 1 | has_text_hs == 1), 1, 0),
         
         is_uscis = str_detect(gov_entity, "USCIS|CITIZENSHIP AND IMMIGRATION SERVICES"),
         is_dol = str_detect(gov_entity, "DEPARTMENT OF LABOR"),
         is_dhs = str_detect(gov_entity, "HOMELAND SECURITY"),
         is_congress = str_detect(gov_entity, "SENATE|HOUSE"),
         is_only_congress = str_detect(gov_entity, "\\{\"HOUSE OF REPRESENTATIVES\",SENATE\\}|\\{SENATE\\}|\\{\"HOUSE OF REPRESENTATIVES\"\\}"),
         is_wh_eop = str_detect(gov_entity, "WHITE HOUSE|EXECUTIVE OFFICE")) %>%
  arrange(report_uuid, client_uuid) %>%
  group_by(report_uuid) %>%
  mutate(has_IMM = ifelse(any(is_IMM == 1), 1, 0),
         has_IMM_or_img_text = ifelse(any(has_IMM == 1) | any(has_img_text == 1), 1, 0),
         lob_img_2017 = has_IMM_or_img_text) %>%
  ungroup() %>%
  arrange(client_uuid, report_year, report_quarter_code, report_uuid, issue_ordi)

# manually checked and identified 16 additional petitioning firms that filed 
# lobbying reports with IMM-related keywords without listing IMM
df.report.2017.client.issue.target.c %>% 
  filter(has_IMM == 1) %>%
  pull(client_name) %>%
  unique()

df.report.2017.client.issue.target.c %>% 
  filter(has_IMM_or_img_text == 1) %>%
  pull(client_name) %>%
  unique()

# set firm name vector
sup.firms.vec <- c("AURORA ORGANIC DAIRY",
                   "BOEING COMPANY",
                   "DOW CHEMICAL COMPANY",
                   "DXC TECHNOLOGY CO. AND AFFILIATES",
                   "HAWAIIAN AIRLINES",
                   "HIGH SIERRA POOLS",
                   "NASDAQ STOCK MARKET INC",
                   "NATL GRID",
                   "NORTHWESTERN MUTUAL LIFE INSURANCE COMPANY",
                   "PERDUE FARMS",
                   "PROVIDENCE HEALTH & SERVICES",
                   "RACKSPACE HOSTING",
                   "SOUTHWEST AIRLINES",
                   "STAMPS.COM",
                   "SYNCHRONY FINANCIAL",
                   "UNITED AIRLINES, INC.")

# extract corresponding unique firm ids
sup.firms.uuid <- df.report.2017.client.issue.target.c %>% 
  filter(client_name %in% sup.firms.vec) %>%
  pull(client_uuid) %>%
  unique()

# check reports by these firms
sup.firms.lob.df <- df.report.2017.client.issue.target.c %>%
  filter(client_uuid %in% sup.firms.uuid)

# create lobby measures for supplementary firms
sup.firms.lob.measures.df <- sup.firms.lob.df %>%
  group_by(client_uuid) %>%
  summarize(
    client_name = first(client_name),
    client_uuid = first(client_uuid),
    bvdid = first(bvdid),
    
    lob_img_2017 = ifelse(any(has_IMM_or_img_text == 1, na.rm = TRUE), 1, 0),
    
    has_text_hs_h1b_visa = ifelse(any(has_text_hs_h1b_visa == 1, na.rm = TRUE), 1, 0 ),
    
    lob_uscis_2017 = ifelse(any(is_uscis == TRUE, na.rm = TRUE), 1, 0),
    lob_dol_2017 = ifelse(any(is_dol == TRUE, na.rm = TRUE), 1, 0),
    lob_dhs_2017 = ifelse(any(is_dhs == TRUE, na.rm = TRUE), 1, 0),
    lob_congress_2017 = ifelse(any(is_congress == TRUE, na.rm = TRUE), 1, 0),
    lob_only_congress_2017 = ifelse(all(is_only_congress == TRUE, na.rm = TRUE), 1, 0),
    lob_wh_eop_2017 = ifelse(any(is_wh_eop == TRUE, na.rm = TRUE), 1, 0),
    
    lob_text_hs_h1b_visa_2017 = ifelse(has_text_hs_h1b_visa == 1, 1, 0),
    lob_text_hs_h1b_visa_wh_eop_2017 = ifelse(has_text_hs_h1b_visa == 1 & lob_wh_eop_2017 == 1, 1, 0),
    lob_text_hs_h1b_visa_uscis_2017 = ifelse(has_text_hs_h1b_visa == 1 & lob_uscis_2017 == 1, 1, 0),
    lob_text_hs_h1b_visa_dhs_2017 = ifelse(has_text_hs_h1b_visa == 1 & lob_dhs_2017 == 1, 1, 0),
    lob_text_hs_h1b_visa_dol_2017 = ifelse(has_text_hs_h1b_visa == 1 & lob_dol_2017 == 1, 1, 0),
    lob_text_hs_h1b_visa_only_congress_2017 = ifelse(has_text_hs_h1b_visa == 1 & lob_only_congress_2017 == 1, 1, 0)) %>%
  ungroup() 

# update BvD IDs to be consistent with petitions data
sup.firms.lob.measures.final <- sup.firms.lob.measures.df %>%
  arrange(client_name) %>%
  mutate(bvd_id = ifelse(client_name == "AURORA ORGANIC DAIRY", "USLEI36510", NA),
         bvd_id = ifelse(client_name == "BOEING COMPANY", "US910425694", bvd_id),
         bvd_id = ifelse(client_name == "DOW CHEMICAL COMPANY", "US381285128", bvd_id),
         bvd_id = ifelse(client_name == "DXC TECHNOLOGY CO. AND AFFILIATES", "US611800317", bvd_id),
         bvd_id = ifelse(client_name == "HAWAIIAN AIRLINES", "US123352362L", bvd_id),
         bvd_id = ifelse(client_name == "HIGH SIERRA POOLS", "US132244276L", bvd_id),
         bvd_id = ifelse(client_name == "NASDAQ STOCK MARKET INC", "US237501362L", bvd_id),
         bvd_id = ifelse(client_name == "NATL GRID", "US263241353L", bvd_id),
         bvd_id = ifelse(client_name == "NORTHWESTERN MUTUAL LIFE INSURANCE COMPANY", "US120109292L", bvd_id),
         bvd_id = ifelse(client_name == "PERDUE FARMS", "US125532469L", bvd_id),
         bvd_id = ifelse(client_name == "PROVIDENCE HEALTH & SERVICES", "US129056765L", bvd_id),
         bvd_id = ifelse(client_name == "RACKSPACE HOSTING", "US129331972L", bvd_id),
         bvd_id = ifelse(client_name == "SOUTHWEST AIRLINES", "US741563240", bvd_id),
         bvd_id = ifelse(client_name == "STAMPS.COM", "US770454966", bvd_id),
         bvd_id = ifelse(client_name == "SYNCHRONY FINANCIAL", "USFEB36117", bvd_id),
         bvd_id = ifelse(client_name == "UNITED AIRLINES, INC.", "US742099724", bvd_id))

# rename variables and subset
sup.imm.lob.firms <- sup.firms.lob.measures.final %>%
  rename(lob_img_2017_aug = lob_img_2017,
         lob_text_hs_h1b_visa_2017_aug = lob_text_hs_h1b_visa_2017,
         lob_text_hs_h1b_visa_uscis_2017_aug = lob_text_hs_h1b_visa_uscis_2017,
         lob_text_hs_h1b_visa_dol_2017_aug = lob_text_hs_h1b_visa_dol_2017,
         lob_text_hs_h1b_visa_dhs_2017_aug = lob_text_hs_h1b_visa_dhs_2017,
         lob_text_hs_h1b_visa_wh_eop_2017_aug = lob_text_hs_h1b_visa_wh_eop_2017,
         lob_text_hs_h1b_visa_only_congress_2017_aug = lob_text_hs_h1b_visa_only_congress_2017) %>%
  select(bvd_id, 
         client_name,
         lob_img_2017_aug,
         lob_text_hs_h1b_visa_2017_aug,
         lob_text_hs_h1b_visa_uscis_2017_aug,
         lob_text_hs_h1b_visa_dol_2017_aug,
         lob_text_hs_h1b_visa_dhs_2017_aug,
         lob_text_hs_h1b_visa_wh_eop_2017_aug,
         lob_text_hs_h1b_visa_only_congress_2017_aug)

# save
save(sup.imm.lob.firms, 
     file = paste(MAIN_DIR, "sup-imm-lob-firms.RData", sep = "/"))  

